The historical data for this project is available from the UCI Machine Learning Repository: https://archive.ics.uci.edu/ml/datasets/Bank+Marketing
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import pandas_profiling
import pydotplus as pydot
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline
sns.set(color_codes=True)
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score,f1_score,recall_score,precision_score, confusion_matrix # for model validation scores
from sklearn.feature_extraction.text import CountVectorizer # DT does not take strings as input for the model fit step
from yellowbrick.classifier import ClassificationReport, ROCAUC
import missingno as msno_plot # for plotting missing values
from os import system
from IPython.display import display # for displaying multiple data frames in one output
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))
plt.style.use('ggplot')
pd.options.display.float_format = '{:,.4f}'.format
# Load the raw bank-marketing dataset and take a first look at its structure.
# NOTE(review): the UCI download of bank-full.csv is ';'-separated; this read
# assumes a comma-delimited copy — confirm the local file format.
df = pd.read_csv("bank-full.csv")
print(df.columns)
print()
print(df.shape)
print()
print(df.info())
print()
print(df.size)
print()
# Show the first and last 5 rows together (bare expression displays in a notebook cell).
pd.concat([df.head(5),df.tail(5)])
# Reorder columns: numeric attributes first, then categoricals, label 'Target' last.
newOrder=['age','balance','day','duration','campaign','pdays','previous','job','marital','education','default','housing','loan','contact','month','poutcome','Target']
df=df[newOrder]
pd.concat([df.head(5),df.tail(5)])
print("General information:")
df.info()
print()
print("Unique values per columns:")
df.nunique()
df.describe().T
df.describe(include='object')
General Findings & Comments:
Actions:
# Replacing pdays value of -1 with value of 0
# NOTE(review): in this dataset -1 marks "client never previously contacted";
# mapping it to 0 conflates that with "contacted 0 days ago" — confirm intent
# (the column is dropped later anyway).
df['pdays']=df['pdays'].replace(-1, 0)
df.describe().T
print("Percent of missing data:")
print(df.isnull().mean()*100)
print()
print("Visual representation of missing data:")
msno_plot.bar(df);
None necessary.
# Check for outliers in the attributes with continuous numerical values:
# one row of boxplots and one row of distribution plots, 7 columns each.
col_num= ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
plt.figure(figsize=(20,15))
pos = 1
# Fixed: the two loops previously used different grid shapes (6x7 then 5x7)
# in the SAME figure while `pos` kept counting, which misaligns the axes;
# a single 2x7 grid holds exactly the 14 plots.
for i in col_num:
    plt.subplot(2, 7, pos)
    sns.boxplot(df[i])
    pos += 1
print()
for i in col_num:
    plt.subplot(2, 7, pos)
    sns.distplot(df[i])
    pos += 1
# Inter-quartile range per column, used below for outlier detection.
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1
print(IQR)
# Focused analysis on the attributes where Q1 == Q3 ('previous', 'pdays') vs Target.
# Fixed: sns.scatterplot was called with positional x/y arguments, which
# seaborn >= 0.12 rejects (x and y became keyword-only); keyword arguments
# work on all seaborn versions.
sns.scatterplot(x=df['previous'], y=df['pdays'], hue=df['Target'], palette='Set2')
# Distribution of 'previous' split by subscription outcome (red = no, green = yes).
# (sns.distplot is deprecated in newer seaborn; kept for identical output here.)
sns.distplot(df[df['Target']=="no"]['previous'],color='r',label="no")
sns.distplot(df[df['Target']=="yes"]['previous'],color='g',label="yes")
plt.legend()
plt.show()
# Same comparison for 'pdays'.
sns.distplot(df[df['Target']=="no"]['pdays'],color='r',label="no")
sns.distplot(df[df['Target']=="yes"]['pdays'],color='g',label='yes')
plt.legend()
plt.show()
# Cross-tabulated counts confirming both attributes are almost always 0.
df.groupby(['Target','pdays'])['previous'].value_counts()
df.groupby(['Target','previous'])['pdays'].value_counts()
Outlier treatment for 'pdays' and 'previous': remove these columns entirely, since in more than 90% of the rows where "Target" = "yes" both values are 0 — meaning that more than 90% of the time a client subscribed without any prior/recent contact.
For all other attributes, cap outliers using the 1.5×IQR rule (values below Q1 − 1.5×IQR or above Q3 + 1.5×IQR).
# image included for context
# image acquired from blog post "Understanding Boxplots" by Michael Galarnyk at
# https://towardsdatascience.com/understanding-boxplots-5e2df7bcbd51
from IPython.display import Image
# Display the local reference figure explaining the 1.5*IQR outlier rule.
Image("boxplotOutliersDist.jpg")
# Remove outliers by capping each numeric column at Q1 - 1.5*IQR and Q3 + 1.5*IQR.
# Fixed: the original used a row-wise .apply with a nested lambda (slow) and
# computed an unused median (q2); Series.clip performs the same capping vectorized.
for i in col_num:
    q1, q3 = df[i].quantile([0.25, 0.75])
    IQR = q3 - q1
    lower_cap = q1 - 1.5 * IQR
    upper_cap = q3 + 1.5 * IQR
    df[i] = df[i].clip(lower=lower_cap, upper=upper_cap)
# Confirm the outliers were capped: re-draw the box and distribution plots.
# Fixed: as in the earlier figure, the two loops used mismatched subplot grids
# (6x7 then 5x7) in one figure; a single 2x7 grid keeps the 14 axes aligned.
plt.figure(figsize=(20,15))
pos = 1
for i in col_num:
    plt.subplot(2, 7, pos)
    sns.boxplot(df[i])
    pos += 1
print()
for i in col_num:
    plt.subplot(2, 7, pos)
    sns.distplot(df[i])
    pos += 1
# drop 'pdays' and 'previous' columns (per the outlier findings: >90% zeros)
col_drop= ['pdays', 'previous']
df=df.drop(labels=col_drop, axis='columns')
df.describe().T
#df = df.drop(['pdays','previous'],axis=1)
General Findings & Comments:
Actions:
Attributes that may require Binning (over 12 unique values)
Attributes that may require conversion binary values of 0 or 1 for yes/no values
Attributes that may require ordinal/numerical encodings
Attributes that may require one-hot encodings
# Convert every string-typed column to pandas Categorical dtype.
# NOTE(review): pd.Categorical keeps the string categories — it does NOT replace
# them with integers; the numeric encoding happens later via replace/get_dummies.
for feature in df.columns: # Loop through all columns in the dataframe
    if df[feature].dtype == 'object': # Only apply for columns with categorical strings
        df[feature] = pd.Categorical(df[feature])
print(df.info())
pd.concat([df.head(5),df.tail(5)])
By Attribute:
# Frequency table for every categorical attribute, printed one after another.
categorical_cols = ['job', 'marital', 'education', 'default', 'housing',
                    'loan', 'contact', 'month', 'poutcome', 'Target']
for col in categorical_cols:
    print('For %s:' % col)
    print(df[col].value_counts())
    print()
# need to: create iterative function
# Mean age per job/Target cell (default aggfunc='mean'), truncated to int for readability.
df.pivot_table(index='job', columns='Target', values='age').astype('int')
# Row counts per job/Target cell ('age' serves only as a non-null column to count).
df.pivot_table(index='job', columns='Target', values='age', aggfunc='count')
df.pivot_table(index='marital', columns='Target', values='age').astype('int')
df.pivot_table(index='marital', columns='Target', values='age', aggfunc='count')
df.pivot_table(index='education', columns='Target', values='age').astype('int')
df.pivot_table(index='education', columns='Target', values='age', aggfunc='count')
# Encode categorical attributes as numbers:
#  - 'month', 'education', 'contact', 'poutcome': ordinal integer codes
#  - yes/no flags and the label: binary 0/1
#  - 'job', 'marital': one-hot encoded below (no meaningful order)
replaceStruct = {
    "month": {"jan":1,"feb":2,"mar":3,"apr":4,"may":5,"jun":6,"jul":7,"aug":8,"sep":9,"oct":10,"nov":11,"dec":12},
    "poutcome": {"failure": 0, "unknown": 1, "other": 2, "success": 3},
    # Fixed: 'default' was mapped no->1/yes->0, the inverse of every other
    # yes/no column ('housing', 'loan', 'Target'); now consistently no->0/yes->1.
    "default": {"no": 0, "yes": 1 },
    "housing": {"no": 0, "yes": 1 },
    "loan": {"no": 0, "yes": 1 },
    "education": {"unknown": 0, "primary": 1, "secondary": 2, "tertiary": 3 },
    #"marital": {"divorced": 0, "single": 1, "married": 2 },
    "contact": {"unknown": 0, "telephone": 1, "cellular": 2 },
    #"job": {}
    "Target": {"no": 0, "yes": 1 }
}
ohe_cols=["job","marital"]
df=df.replace(replaceStruct)
df=pd.get_dummies(df, columns=ohe_cols)
df.info()
pd.concat([df.head(5),df.tail(5)])
# Descriptive statistics for the remaining continuous attributes.
dfnum=df[['age', 'balance', 'day', 'duration', 'campaign']]
dfnum.describe().T
dfnum.nunique()
# distributions of numerical (continuous) attributes
fig, axs = plt.subplots(ncols = 5, figsize = (20, 5))
bins=5
sns.distplot(df['age'], bins=bins, ax = axs[0])
sns.distplot(df['balance'], bins=bins, ax = axs[1])
sns.distplot(df['day'], bins = bins, ax = axs[2])
sns.distplot(df['duration'], bins = bins, ax = axs[3])
sns.distplot(df['campaign'], bins=bins, ax = axs[4]);
#sns.distplot(df['pdays'], bins=30, ax = axs[5]);
#sns.distplot(df['previous'], bins=30,ax = axs[6]);
# Boxplots
col_num= ['age', 'balance', 'day', 'duration', 'campaign']
plt.figure(figsize=(29,5))
pos = 1
# NOTE(review): a 5x7 grid is declared for only 5 plots, so each axes is tiny;
# (1, 5, pos) would fill the wide figure — confirm the intended layout.
for i in col_num:
    plt.subplot(5, 7, pos)
    sns.boxplot(df[i])
    pos += 1
df.describe().T
df.columns
# distributions of numerical (continuous) attributes, with per-column bin counts
fig, axs = plt.subplots(ncols = 5, figsize = (30, 7))
bins=5
sns.distplot(df['age'], bins=6, ax = axs[0])
sns.distplot(df['balance'], bins= bins,ax = axs[1])
sns.distplot(df['day'], bins = 4, ax = axs[2])
sns.distplot(df['duration'], bins = 11, ax = axs[3])
sns.distplot(df['campaign'], bins = 2,ax = axs[4]);
#sns.distplot(df['pdays'], bins=bins, ax = axs[5]);
#sns.distplot(df['previous'], bins=bins,ax = axs[6]);
# distributions of ordinal attributes
# NOTE(review): axes are filled out of order (month -> axs[3], default -> axs[0], ...);
# the plots still render, just not left-to-right in code order.
fig, axs = plt.subplots(ncols = 5, figsize = (30, 7))
bins=5
sns.distplot(df['month'], bins = bins, ax = axs[3])
sns.distplot(df['default'], bins=2, ax = axs[0])
sns.distplot(df['housing'], bins = 2, ax = axs[1])
sns.distplot(df['loan'], bins = 2,ax = axs[2])
sns.distplot(df['poutcome'], bins = 4, ax = axs[4]);
df.info()
df.describe().T
# correlations of Numerical / Continuous attributes and Target
df1=df[['age', 'balance', 'day', 'duration', 'campaign','Target']]
sns.heatmap(df1.corr(),
            annot=True,
            linewidths=.5,
            center=0,
            cbar=False,
            cmap="YlGnBu")
plt.show()
df1.corr()
# correlations of One Hot Encoded 'job' attributes and Target part 1
df2=df[['job_admin.', 'job_blue-collar','job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired','Target']]
plt.figure(figsize=(10,8))
sns.heatmap(df2.corr(),
            annot=True,
            linewidths=.5,
            center=0,
            cbar=False,
            cmap="YlGnBu")
plt.show()
df2.corr()
# correlations of One Hot Encoded 'job' attributes and Target part 2
df3=df[['job_self-employed', 'job_services', 'job_student', 'job_technician','job_unemployed', 'job_unknown','Target']]
plt.figure(figsize=(10,8))
sns.heatmap(df3.corr(),
            annot=True,
            linewidths=.5,
            center=0,
            cbar=False,
            cmap="YlGnBu")
plt.show()
df3.corr()
# correlations of One Hot Encoded 'marital' attributes and Target
df4=df[[ 'marital_divorced',
'marital_married', 'marital_single','Target']]
plt.figure(figsize=(10,8))
sns.heatmap(df4.corr(),
            annot=True,
            linewidths=.5,
            center=0,
            cbar=False,
            cmap="YlGnBu")
plt.show()
df4.corr()
# Pairwise scatter/KDE plots of each attribute subset, coloured by Target.
sns.pairplot(df1, hue="Target", palette="husl")
sns.pairplot(df2, hue="Target", palette="husl")
sns.pairplot(df3, hue="Target", palette="husl")
sns.pairplot(df4, hue="Target", palette="husl")
General Findings & Comments
TODO: summarize the correlation and pairplot findings here.
df.columns
# Drop all one-hot 'job' and 'marital' dummy columns.
# NOTE(review): the rationale is not recorded (the findings cell above is a
# placeholder) — presumably low correlation with Target; confirm before relying on it.
df = df.drop([ 'job_admin.', 'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
'job_management', 'job_retired', 'job_self-employed', 'job_services',
'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
'marital_divorced', 'marital_married', 'marital_single'],axis=1)
# One dataframe handle per model family.
# Fixed: plain assignment (df_logr = df) only aliases the SAME DataFrame object,
# so a mutation through any one name would silently affect every model;
# .copy() gives each model an independent frame.
df_logr=df.copy()   # dataframe for logistic regression model
df_tree=df.copy()   # dataframe for decision tree model
df_forest=df.copy() # dataframe for random forest model
df_bag=df.copy()    # dataframe for bagging model
df_ada=df.copy()    # dataframe for ada boost model
df_gradb=df.copy()  # dataframe for gradient boost model
# Feature matrix X and label Y for the logistic-regression experiments.
X = df_logr.drop(['Target'], axis=1)
Y = df_logr[['Target']] # double brackets keep Y a (n,1) DataFrame rather than a Series
##Split into training and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30,random_state=42)
print("{0:0.2f}% data is in training set".format((len(X_train)/len(df_logr.index)) * 100))
print("{0:0.2f}% data is in test set".format((len(X_test)/len(df_logr.index)) * 100))
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score, roc_auc_score, accuracy_score, log_loss
from sklearn.linear_model import LogisticRegression
# Baseline logistic regression with default hyper-parameters.
logreg = LogisticRegression(random_state=42)
logreg.fit(X_train, y_train)
logreg
logreg.coef_
logreg.intercept_
# Fixed: the original made the same train-set predictions twice and also
# computed (then immediately overwrote) test-set predictions; one pass suffices.
y_predict = logreg.predict(X_train)            # hard 0/1 class predictions on the training set
y_predict_prob = logreg.predict_proba(X_train) # class-membership probabilities
## function to get confusion matrix in a proper format
def draw_cm( actual, predicted):
    """Print and plot a labelled binary confusion matrix.

    actual    : array-like of observed 0/1 labels
    predicted : array-like of predicted 0/1 labels
    Prints the raw matrix, then shows it as an annotated heatmap. Returns None.
    """
    cm = confusion_matrix(actual, predicted)
    print(cm)
    # Fixed: fmt was '.2f', which renders the integer cell counts as floats;
    # 'd' shows them as whole numbers.
    sns.heatmap(cm, annot=True, fmt='d', xticklabels = [0,1] , yticklabels = [0,1] )
    plt.ylabel('Observed')
    plt.xlabel('Predicted')
    plt.show()
# Training-set performance of the baseline logistic regression.
confusion_matrix(y_train,y_predict)
# Regression Score
print("Training accuracy",logreg.score(X_train,y_train))
print()
print("Testing accuracy",logreg.score(X_test, y_test))
print()
print('Confusion Matrix')
# Fixed: draw_cm prints/plots itself and returns None, so wrapping it in
# print() appended a spurious "None" line; call it directly.
draw_cm(y_train,y_predict)
print()
print("Recall:",recall_score(y_train,y_predict))
print()
print("Precision:",precision_score(y_train,y_predict))
print()
print("F1 Score:",f1_score(y_train,y_predict))
print()
print("Roc Auc Score:",roc_auc_score(y_train,y_predict))
#AUC ROC curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# AUC here is computed from hard class predictions; the curve below uses
# the positive-class probabilities.
logit_roc_auc = roc_auc_score(y_train, logreg.predict(X_train))
fpr, tpr, thresholds = roc_curve(y_train, logreg.predict_proba(X_train)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--') # diagonal = random-classifier reference line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC') # also saved to disk for the report
plt.show()
# Tabulate the ROC operating points: one row per threshold with its FPR and TPR.
fpr, tpr, thresholds = roc_curve(y_train, logreg.predict_proba(X_train)[:, 1])
fpr_df = pd.DataFrame(fpr).rename(columns={0: 'False Positive Rate'})
tpr_df = pd.DataFrame(tpr).rename(columns={0: 'True Positive Rate'})
thr_df = pd.DataFrame(thresholds).rename(columns={0: 'Threshold'})
result = pd.concat([fpr_df, tpr_df, thr_df], axis=1, sort=False)
result.head(10)
# Goal: find the probability threshold where TPR is high while FPR is low —
# i.e. where tpr - (1 - fpr) is zero or closest to zero (the optimal cut-off).
import pylab as pl
i = np.arange(len(tpr)) # index for df
# One row per ROC operating point, with the tf = tpr - (1-fpr) criterion column.
roc = pd.DataFrame({'fpr' : pd.Series(fpr, index=i),'tpr' : pd.Series(tpr, index = i), '1-fpr' : pd.Series(1-fpr, index = i), 'tf' : pd.Series(tpr - (1-fpr), index = i), 'thresholds' : pd.Series(thresholds, index = i)})
# Row whose tf value is closest to zero = candidate optimal threshold.
roc.iloc[(roc.tf-0).abs().argsort()[:1]]
# Plot tpr vs 1-fpr
fig, ax = pl.subplots()
pl.plot(roc['tpr'])
pl.plot(roc['1-fpr'], color = 'red')
pl.xlabel('1-False Positive Rate')
pl.ylabel('True Positive Rate')
pl.title('Receiver operating characteristic')
ax.set_xticklabels([])
display(roc.head())
# Inspect the mid-range operating points when choosing a threshold by hand.
roc[(roc['tpr']>0.6) & (roc['tpr']<0.7)].head(50)
# Refit and apply the tuned probability threshold instead of the default 0.5.
clf = LogisticRegression(random_state=42)
clf.fit(X_train, y_train)
# NOTE(review): 0.734359 appears to come from inspecting the threshold tables
# above on the TRAINING set — confirm it was not tuned against the test set.
THRESHOLD = 0.734359
preds = np.where(clf.predict_proba(X_test)[:,1] > THRESHOLD, 1, 0)
# Test-set metrics at the custom threshold, displayed as a one-column table.
pd.DataFrame(data=[accuracy_score(y_test, preds), recall_score(y_test, preds),
precision_score(y_test, preds), roc_auc_score(y_test, preds)],
index=["accuracy", "recall", "precision", "roc_auc_score"])
logreg.get_params() # check parameters of logistic regression. If we dont specify the parameters in the model it takes default value
# Compare solvers with an l2 penalty. (Of these solvers, only 'liblinear'
# would also support an l1 penalty; all five support l2.)
train_score=[]
test_score=[]
solver = ['newton-cg','lbfgs', 'liblinear','sag','saga']
for i in solver:
    model = LogisticRegression(random_state=42, penalty='l2', C = 0.75,solver=i) # changing values of solver
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    train_score.append(round(model.score(X_train, y_train),3))
    test_score.append(round(model.score(X_test, y_test),3))
print(solver)
print()
print(train_score)
print()
print(test_score)
# Effect of penalty / class_weight settings on accuracy.
# NOTE(review): the original trailing comment said "change l1 penalty" but
# penalty='l2' is what is actually fitted here.
model = LogisticRegression(random_state=42,penalty='l2')
model.fit(X_train, y_train)
y_predict = model.predict(X_test)
print("Trainig accuracy",model.score(X_train,y_train))
print()
print("Testing accuracy",model.score(X_test, y_test))
# Same model but with class_weight='balanced' to counter the skewed Target distribution.
model = LogisticRegression(random_state=42, penalty='l2',class_weight='balanced') # changing class weight to balanced
model.fit(X_train, y_train)
y_predict = model.predict(X_test)
print("Trainig accuracy",model.score(X_train,y_train))
print()
print("Testing accuracy",model.score(X_test, y_test))
# Sweep the inverse-regularisation strength C and record train/test accuracy.
train_score = []
test_score = []
C = [0.01, 0.1, 0.25, 0.5, 0.75, 1]
for c_value in C:
    # Smaller C means stronger l2 regularisation; class_weight stays 'balanced'.
    model = LogisticRegression(random_state=42, penalty='l2',
                               class_weight='balanced', C=c_value)
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    train_score.append(round(model.score(X_train, y_train), 3))
    test_score.append(round(model.score(X_test, y_test), 3))
print(C)
print()
print(train_score)
print()
print(test_score)
# Final logistic-regression model: l2 penalty, balanced class weights, C=0.25
# (chosen from the sweep above), evaluated on the held-out test set.
model = LogisticRegression(random_state=42,penalty='l2', class_weight='balanced',C=0.25)
model.fit(X_train, y_train)
y_predict = model.predict(X_test)
print("Training accuracy",model.score(X_train,y_train))
print()
print("Testing accuracy",model.score(X_test, y_test))
print()
print('Confusion Matrix')
# Fixed: draw_cm returns None, so print(draw_cm(...)) printed a stray "None";
# call it directly.
draw_cm(y_test,y_predict)
print()
print("Recall:",recall_score(y_test,y_predict))
print()
print("Precision:",precision_score(y_test,y_predict))
print()
print("F1 Score:",f1_score(y_test,y_predict))
print()
print("Roc Auc Score:",roc_auc_score(y_test,y_predict))
Note - In Decision Trees, we need not remove highly correlated variables as nodes are divided into sub nodes using one independent variable only, hence even if two or more variables are highly correlated, the variable producing the highest information gain will be used for the analysis
# splitting data into training and test set for independent attributes
# NOTE(review): this split uses random_state=22 while the logistic-regression
# split used 42, so the two model families see different train/test partitions.
from sklearn.model_selection import train_test_split
features = [col for col in df_tree.columns if col != 'Target']
X_train, X_test, y_train, y_test = train_test_split(df_tree[features], df_tree['Target'], test_size=.3, random_state=22)
X_train.shape, X_test.shape
# Invoke the decision tree classifier function.
# create gini and entropy methods of finding the split columns
model_entropy = DecisionTreeClassifier(criterion='entropy', random_state=42)
model_gini = DecisionTreeClassifier(criterion='gini', random_state=42)
model_entropy.fit(X_train, y_train)
model_gini.fit(X_train, y_train)
# Structure of the fully-grown entropy tree.
treeObj = model_entropy.tree_
print ('Total Nodes:',treeObj.node_count)
print('Depth:',model_entropy.get_depth())
print('Total Leaves:',model_entropy.get_n_leaves())
# Structure of the fully-grown gini tree.
treeObj = model_gini.tree_
print ('Total Nodes:',treeObj.node_count)
print('Depth:',model_gini.get_depth())
print('Total Leaves:',model_gini.get_n_leaves())
print("Entropy Model Train: %.2f" % model_entropy.score(X_train, y_train)) # performance on train data
print("Entropy Model Test: %.2f" % model_entropy.score(X_test, y_test)) # performance on test data
print("Gini Model Train: %.2f" % model_gini.score(X_train, y_train)) # performance on train data
print("Gini Model Test: %.2f" % model_gini.score(X_test, y_test)) # performance on test data
There is a high degree of overfitting in the model due to which the test accuracy drops drastically. This shows why decision trees are prone to overfitting.
Regularize/prune the decision tree by limiting the max. depth of trees and print the accuracy.
The specific values of maximum depth and minimum samples used below carry no statistical significance; the idea is simply to prune the tree and check how pruning impacts accuracy.
# Pruned (regularised) tree: depth capped at 5, at least 5 samples per leaf.
clf_pruned = DecisionTreeClassifier(criterion = "gini", max_depth=5, min_samples_leaf=5, random_state=42)
clf_pruned.fit(X_train, y_train)
print("Train: %.2f" % clf_pruned.score(X_train, y_train)) # performance on train data
print("Test: %.2f" % clf_pruned.score(X_test, y_test)) # performance on test data
import io
from io import StringIO
from sklearn.tree import export_graphviz
from IPython.display import Image
import pydotplus
import graphviz
# Render the pruned decision tree as a PNG via graphviz.
feature_cols = X_train.columns
dot_data = StringIO()
# Fixed: class_names was ['5','6','8'] — three labels (apparently left over from
# another dataset) for this two-class problem. The encoded Target classes are
# 0 ('no') and 1 ('yes'); class_names must list them in ascending class order.
export_graphviz(clf_pruned, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True, feature_names=feature_cols,
                class_names=['no', 'yes'])
from pydotplus import graph_from_dot_data
graph = graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
# Evaluate the pruned tree on both partitions.
preds_train = clf_pruned.predict(X_train)
preds_test = clf_pruned.predict(X_test)
acc_DT = accuracy_score(y_test, preds_test) # stored for the final model comparison
# Confusion matrix
pd.crosstab(y_test, preds_test, rownames=['Actual'], colnames=['Predicted'])
# Model validation Score
print("Trainig accuracy",clf_pruned.score(X_train,y_train))
print()
print("Testing accuracy",clf_pruned.score(X_test, y_test))
print()
# NOTE(review): for single-label binary data, micro-averaged recall and
# precision both equal overall accuracy — the macro block below is the
# informative one; confirm the micro numbers are wanted.
print("----------------------Micro-----------------------------------------")
print("Recall:",recall_score(y_test,preds_test, average="micro"))
print()
print("Precision:",precision_score(y_test,preds_test, average="micro"))
print()
print("----------------------Macro-----------------------------------------")
print("Recall:",recall_score(y_test,preds_test, average="macro"))
print()
print("Precision:",precision_score(y_test,preds_test, average="macro"))
print()
# Visualize model performance with yellowbrick library
# NOTE(review): these visualizers fit a NEW tree (max_depth=3, min_samples_leaf=4),
# not clf_pruned (max_depth=5, min_samples_leaf=5) — confirm this is intentional.
viz = ClassificationReport(DecisionTreeClassifier(criterion = "gini", max_depth=3, min_samples_leaf=4))
viz.fit(X_train, y_train)
viz.score(X_test, y_test)
viz.show()
roc = ROCAUC(DecisionTreeClassifier(criterion = "gini", max_depth=3, min_samples_leaf=4))
roc.fit(X_train, y_train)
roc.score(X_test, y_test)
roc.show()
## calculate feature importance of the pruned tree
# Fixed: the original first computed unnormalised importances via
# tree_.compute_feature_importances(normalize=False) and then immediately
# discarded that result, building the dict from clf_pruned.feature_importances_
# (the normalised values) instead; the dead computation is removed.
feat_imp_dict = dict(zip(features, clf_pruned.feature_importances_))
feat_imp = pd.DataFrame.from_dict(feat_imp_dict, orient='index')
feat_imp.sort_values(by=0, ascending=False).head()
#Store the accuracy results for each model in a dataframe for final comparison
resultsDf = pd.DataFrame({'Method':['Decision Tree - Gini - Pruned'], 'accuracy': acc_DT})
resultsDf = resultsDf[['Method', 'accuracy']]
resultsDf
from sklearn.ensemble import RandomForestClassifier
# Random forest: 10 trees, each fitted on an 80% bootstrap sample.
rfcl = RandomForestClassifier(n_estimators = 10, max_samples=0.8, random_state=42)
rfcl = rfcl.fit(X_train, y_train)
rfcl
pred_RF = rfcl.predict(X_test)
acc_RF = accuracy_score(y_test, pred_RF)
# Append this model's test accuracy to the comparison table.
tempResultsDf = pd.DataFrame({'Method':['Random Forest'], 'accuracy': [acc_RF]})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Method', 'accuracy']]
resultsDf
## Calculating feature importance for the random forest
feat_importance = rfcl.feature_importances_
feat_imp_dict = dict(zip(features, rfcl.feature_importances_))
feat_imp = pd.DataFrame.from_dict(feat_imp_dict, orient='index')
feat_imp.sort_values(by=0, ascending=False)
# Visualize one member tree (index 5) of the fitted forest.
rfcl.estimators_[5]
dot_data = StringIO()
# Fixed: class_names=['5','6','8'] did not match this binary problem; the
# encoded Target classes are 0 ('no') and 1 ('yes') in ascending order.
export_graphviz(rfcl.estimators_[5], out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True, feature_names=features,
                class_names=['no', 'yes'])
from pydotplus import graph_from_dot_data
graph = graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
from sklearn.ensemble import BaggingClassifier
# Bagging: 70 trees, each on an 80% bootstrap sample.
bgcl = BaggingClassifier(n_estimators=70, max_samples= .8, bootstrap=True, random_state=22)
#------bootstrap=True signifies that entire sample is not used to develop a tree
bgcl = bgcl.fit(X_train, y_train)
pred_BG = bgcl.predict(X_test)
acc_BG = accuracy_score(y_test, pred_BG)
tempResultsDf = pd.DataFrame({'Method':['Bagging'], 'accuracy': [acc_BG]})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Method', 'accuracy']]
resultsDf
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost with a conservative learning rate.
abcl = AdaBoostClassifier(n_estimators = 100, learning_rate=0.1, random_state=22)
abcl = abcl.fit(X_train, y_train)
# n_estimator - The maximum number of estimators at which boosting is terminated. In case of perfect fit, the learning procedure is stopped early.
pred_AB =abcl.predict(X_test)
acc_AB = accuracy_score(y_test, pred_AB)
tempResultsDf = pd.DataFrame({'Method':['Adaboost'], 'accuracy': [acc_AB]})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Method', 'accuracy']]
resultsDf
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting with 50 stages.
gbcl = GradientBoostingClassifier(n_estimators = 50, learning_rate = 0.1, random_state=22)
gbcl = gbcl.fit(X_train, y_train)
pred_GB = gbcl.predict(X_test)
acc_GB = accuracy_score(y_test, pred_GB)
tempResultsDf = pd.DataFrame({'Method':['Gradient Boost'], 'accuracy': [acc_GB]})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Method', 'accuracy']]
resultsDf
# Spot-check predicted class probabilities for individual samples.
x=model_entropy.predict_proba(X_train)
x[1,1]
x=clf_pruned.predict_proba(X_test)
x[250,1]
from sklearn.model_selection import GridSearchCV
# Hyper-parameter grid for the RANDOM FOREST (the original comment said
# "Decision Tree"): criterion, bootstrap fraction, and tree count.
params = {'criterion':['gini','entropy'],'max_samples':[0.7,0.75,0.6,0.8],'bootstrap':[True],'n_estimators':list(range(10,120,20))}
grid_search_model = GridSearchCV(RandomForestClassifier(random_state=42), params, verbose=1, cv=3) #---Cross validation is 3
grid_search_model.fit(X_train, y_train)
#-------------------------Getting the estimator the returns the best fit model based on accuracy score--------------------------
grid_search_model.best_estimator_
grid_search_model.best_params_
from sklearn.ensemble import RandomForestClassifier
# Refit a forest with the selected settings.
# NOTE(review): confirm these hard-coded values (entropy, 70 trees, 0.8 sample)
# actually match grid_search_model.best_params_ from the cell above.
rfcl = RandomForestClassifier(n_estimators = 70, random_state=42, criterion = 'entropy', max_samples=0.8)
rfcl = rfcl.fit(X_train, y_train)
display(rfcl)
pred_RF = rfcl.predict(X_test)
acc_RF = accuracy_score(y_test, pred_RF)
tempResultsDf = pd.DataFrame({'Method':['Random Forest-Grid Search'], 'accuracy': [acc_RF]})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Method', 'accuracy']]
resultsDf
# screenshot of Final Logistic Regression model performance for reference / comparison against other models
from IPython.display import Image
Image("lr_finalModelScore.jpg")
Highest accuracy = "Random Forest - Grid Search". Lowest accuracy = Logistic Regression, followed by Adaboost. Delta between highest and lowest accuracy = 0.12.
Recommendation: run the Decision Tree (Gini, pruned) model for predictions, given its simplicity and interpretability compared to the complexity of the ensemble methods, which offer only marginal gains in performance.